0. Data Import and Cleaning

Data Import

# Read the vgsales CSV from the project's dataset directory, then list the
# column names of the resulting data frame.
videogames.df <- read.csv(
  file = file.path(project.dir, dataset.dir, 'vgsales-12-4-2019.csv')
)
names(videogames.df)
##  [1] "Rank"           "Name"           "basename"       "Genre"         
##  [5] "ESRB_Rating"    "Platform"       "Publisher"      "Developer"     
##  [9] "VGChartz_Score" "Critic_Score"   "User_Score"     "Total_Shipped" 
## [13] "Global_Sales"   "NA_Sales"       "PAL_Sales"      "JP_Sales"      
## [17] "Other_Sales"    "Year"           "Last_Update"    "url"           
## [21] "status"         "Vgchartzscore"  "img_url"

Data cleaning

# The data was collected in April of 2019, so games with Year = 2019 are
# excluded: they would not give a comprehensive picture of all 2019 sales.
# Note that filter() also drops rows whose Year is NA.
videogames.clean <- videogames.df %>% filter(Year < 2019)
# E was originally called KA for ESRB ratings, so recode every KA rating as E.
videogames.clean <- videogames.clean %>% mutate(ESRB_Rating = replace(ESRB_Rating, ESRB_Rating=='KA', 'E'))
# Give the ESRB ratings explicit factor levels for easier graphing / data manipulation.
unique(videogames.clean$ESRB_Rating)
## [1] "E"   ""    "M"   "E10" "T"   "RP"  "EC"  "AO"
# Levels start with the unrated ('') and Rating Pending (RP) values and then
# follow the age tiers in ascending order. EC (Early Childhood) comes before
# E (Everyone) so that plots and tables list the ratings youngest-to-oldest.
videogames.clean$ESRB_Rating <- factor(
  videogames.clean$ESRB_Rating,
  levels = c('', 'RP', 'EC', 'E', 'E10', 'T', 'M', 'AO')
)

Data reshaping

We want to compare sales across different regions, so it would be convenient to have one column ‘region’ and then a corresponding column for sales in USD (millions).

# Stack the five sales columns (Global_Sales through Other_Sales) into long
# format: the source column name goes into `Region` and its value into `Sales`.
# "Global_Sales" therefore appears as one of the Region values alongside the
# regional columns. Rows with a missing sales value are dropped.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
vs_byregion <- videogames.clean %>%
  gather(Region, Sales, Global_Sales:Other_Sales, na.rm = TRUE)

1. Descriptive analysis

Conduct some descriptive analysis on the data, figuring out: * distributions of variables, * variables that appear to be strongly related with each other (using appropriate methods to quantify the relationships based on whether variables are numerical or categorical).

Distributions

Sales

From the boxplot we can see that we have two extreme outliers. After investigating, both turn out to be GTA V (the PS3 and PS4 releases).

# Boxplot of global sales across all cleaned rows.
boxplot(x = videogames.clean$Global_Sales, xlab = 'Global Sales (millions of USD)')

# Show the rows whose global sales exceed 17 (rows with NA sales are skipped).
subset(videogames.clean, Global_Sales > 17)
# Histogram of global sales, requesting 2000 breaks and zooming the x axis
# to the 0 - 0.5 range.
hist(videogames.clean$Global_Sales,
     breaks = 2000,
     xlim = c(0, .5),
     xlab = 'Global Sales (millions of USD)')

# Full table ordered by global sales, largest first.
arrange(videogames.clean, desc(Global_Sales))

Platform

# Interactive (plotly) bar chart of the number of rows per Platform, with bars
# ordered from the most to the fewest games and the x labels rotated 90 degrees.
ggplotly(
  ggplot(
    count(videogames.clean, Platform, sort = TRUE),
    aes(x = reorder(Platform, -n), y = n)
  ) +
    geom_bar(stat = "identity", position = position_dodge(width = 0)) +
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
)

ESRB Rating

# Bar chart of the number of rows carrying each ESRB rating.
ggplot(data = videogames.clean, mapping = aes(x = ESRB_Rating)) +
  geom_bar()

Genre

# Bar chart of the number of rows per Genre, bars ordered from the most to the
# fewest games, with the x labels tilted 45 degrees.
ggplot(
  count(videogames.clean, Genre, sort = TRUE),
  aes(x = reorder(Genre, -n), y = n)
) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Scoring

Here we looked at distribution of User Scores and Critic Scores as well as the average Critic and User Score over time.

# Overlaid histograms of critic and user scores (bin width 0.5).
# A constant placed inside aes() is treated as a data label, not a colour, so
# the original 'pink' / 'blue' strings only became legend entries drawn in the
# default palette. Here aes() maps each layer to a descriptive label and
# scale_fill_manual() assigns the intended colours to those labels.
# alpha keeps both distributions visible where they overlap.
videogames.clean %>% ggplot() +
  geom_histogram(aes(x = Critic_Score, fill = 'Critic score'),
                 binwidth = 0.5, alpha = 0.5) +
  geom_histogram(aes(x = User_Score, fill = 'User score'),
                 binwidth = 0.5, alpha = 0.5) +
  scale_fill_manual(name = 'Score type',
                    values = c('Critic score' = 'pink', 'User score' = 'blue')) +
  xlab('Score')

Publisher

We have a ton of publishers

Year

# Bar chart of the number of rows per release Year.
ggplot(data = videogames.clean, mapping = aes(x = Year)) +
  geom_bar()

Notable Correlations

Sales by year

# Total (SSales) and mean (MSales) sales for every Year x Region combination.
# Both statistics come from a single summarize() call, so they are aligned by
# construction rather than by re-running the grouping and attaching the second
# column by row position. ungroup() leaves a plain table for later steps.
vs_sales.byregion.byyear <- vs_byregion %>%
  group_by(Year, Region) %>%
  summarize(SSales = sum(Sales), MSales = mean(Sales)) %>%
  ungroup()
# Solid lines show total sales per region; dotted lines show mean sales,
# multiplied by 100 (presumably so they are readable on the same y axis as
# the totals).
vs_sales.byregion.byyear %>% ggplot(aes(x = Year)) +
  geom_line(aes(y = SSales, color = Region)) +
  geom_line(linetype = "dotted", aes(y = MSales * 100, color = Region))

Sales per platform

Sales per rating

Sales per genre

ESRB Rating per year

Rating by year

# Mean user, critic and VGChartz score per release year (NAs ignored), for
# 1989 onward, reshaped to long format and drawn as one line per score type.
videogames.clean %>%
  group_by(Year) %>%
  summarise(
    User_Score = mean(User_Score, na.rm = TRUE),
    Critic_Score = mean(Critic_Score, na.rm = TRUE),
    Vgchartzscore = mean(Vgchartzscore, na.rm = TRUE)) %>%
  filter(Year >= 1989) %>%
  # NOTE(review): an earlier version also built a User_Score2 column that
  # blanked user scores before 1996, but it was never plotted, so it is dropped
  # here. If pre-1996 user scores should be hidden, mask User_Score before
  # gather() -- confirm the intent.
  gather(ScoreType, Score, c(User_Score, Critic_Score, Vgchartzscore), na.rm = TRUE) %>%
  ggplot(aes(x = Year)) + # TODO : Make look better
  geom_line(aes(y = Score, color = ScoreType)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5)) +
  # One x scale carries the title, the yearly breaks and the limits. A separate
  # xlim() call would replace this scale and discard the breaks.
  scale_x_continuous('Year', breaks = 1989:2018, limits = c(1989, 2018))

Remove GTA V outliers

# Keep only rows that have a recorded Global_Sales value below 17. This drops
# the extreme outliers as well as every row whose Global_Sales is NA.
videogames.clean <- filter(
  videogames.clean,
  !is.na(Global_Sales),
  Global_Sales < 17
)

2. One sample stats inference for response variable of interest (sales)

⁃   construct CI for population mean value for sales

Total sample mean and CI

# t-based confidence interval for mean global sales over the whole cleaned sample.
confidence <- 0.95
alpha <- 1 - confidence

n <- nrow(videogames.clean)
mu.hat.all <- mean(videogames.clean$Global_Sales)
sd.hat <- sd(videogames.clean$Global_Sales)
se <- sd.hat / sqrt(n)

# Point estimate -/+ the t critical value (n - 1 df) times the standard error.
CI <- mu.hat.all + c(-1, 1) * qt(1 - alpha / 2, df = n - 1) * se

mu.hat.all
## [1] 0.3650039
CI
## [1] 0.3535682 0.3764396

One sample inference and CI (2008)
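The test result is not statistically significant at the 5% level (p-value = 0.2212), with a 95% confidence interval of [0.3095337, 0.3778467]. This means that we are 95% confident that the mean global sales of games released in 2008 lies between roughly 0.310 and 0.378 million, an interval that contains the overall sample mean of 0.365.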

# Null value: the mean of the whole cleaned sample stands in for the
# population mean.
mu0 <- mu.hat.all
# Global sales of the games released in 2008, with missing values excluded.
samp2008 <- videogames.clean %>%
  filter(Year == 2008, !is.na(Global_Sales)) %>%
  pull(Global_Sales)
mu.hat <- mean(samp2008)
# Two-sided one-sample t-test of the 2008 mean against mu0.
t.test(samp2008, mu = mu0)
## 
##  One Sample t-test
## 
## data:  samp2008
## t = -1.2239, df = 1671, p-value = 0.2212
## alternative hypothesis: true mean is not equal to 0.3650039
## 95 percent confidence interval:
##  0.3095337 0.3778467
## sample estimates:
## mean of x 
## 0.3436902

Check assumptions: * The sample is not randomized (vgchartz’s game database does not include all games and would have a bias towards including games that are available in english) * The population sales distribution is not normal at all (extreme right skew) * The dataset had two extreme outliers identified via boxplot, and removed.

Although the t-distribution CI is robust against non-normal populations, it is highly sensitive to violations of the random sampling assumption. Our dataset is likely missing a disproportionate number of non-western games and older games, so we likely have an undercoverage issue using the t-distribution method if we consider our population to be all video games that ever existed worldwide. But if we consider our population to be ???????????? then the CI we have is trustworthy.

We can also use bootstrap to estimate the 95% CI for the mean of video game sales. I would not expect this result to be significantly different, or better than the t-distribution method because bootstrapping is also sensitive to non-random sampling, because the assumption is that our sample is a good representation of the population we are interested in.

3. Construct confidence interval for proportion of video games in a certain genre (Action) (conduct hypothesis test against pre-specified constant)

Here we conducted a one-sample t-test with a 95% confidence interval on sales for the Action genre. We ignored any NA values and made sure to exclude extreme outliers such as GTA V. The null hypothesis is that mean Action video game sales = 0.365 (the overall mean), and the alternative hypothesis is that mean Action video game sales != 0.365. The overall mean video game sales does not fall within the 95% confidence interval for Action game sales, [0.369, 0.437] (p-value = 0.027). Thus, we reject the null hypothesis.

# Action games only; NA sales values are ignored.

actions <- filter(videogames.clean, Genre == 'Action')

# One-sample t-test of mean Action global sales against 0.365, with a 95% CI.
actionconf <- t.test(x = actions$Global_Sales, mu = 0.365, conf.level = 0.95)

print(actionconf)
## 
##  One Sample t-test
## 
## data:  actions$Global_Sales
## t = 2.2127, df = 2891, p-value = 0.02699
## alternative hypothesis: true mean is not equal to 0.365
## 95 percent confidence interval:
##  0.3693669 0.4373343
## sample estimates:
## mean of x 
## 0.4033506
# 95% CI for mean Action sales: 0.3693669 0.4373343

# H0: mean Action sales equals the overall mean video game sales (0.365).
# H1: mean Action sales differs from 0.365.

# Overall mean global sales, used as the null value above.
avgsales <- mean(videogames.clean$Global_Sales, na.rm = TRUE)
avgsales
## [1] 0.3650039
#0.365

#p-value = 0.02699

# The overall mean (0.365) does not fall into the 95% confidence interval
# [0.369, 0.437], so we reject the null hypothesis.

#Cohen.d(t.test, data= videogames.clean)

# Cohen's effect size: the absolute difference between the Action mean and the
# overall mean, divided by the root of the average of the two variances.
# NOTE(review): the "overall" sample includes the Action games themselves, so
# the two groups are not independent -- confirm this is the intended formula.
abs(mean(actions$Global_Sales) - mean(videogames.clean$Global_Sales)) /
  sqrt((sd(actions$Global_Sales)^2 + (sd(videogames.clean$Global_Sales)^2)) / 2)
## [1] 0.04389653
#0.04389653

5. We will (c) construct a confidence interval for population correlation between critic score and sales using bootstrap.